library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.1
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the pitching statistics from stats.csv in the working directory.
pitchers <- read_csv(file = "stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   last_name = col_character(),
##   first_name = col_character(),
##   X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Wrap the data as a tibble and add a "First Last" name column.
pitchers <- tibble(pitchers) %>%
  mutate(name = paste(first_name, last_name))

# Order rows by pitcher name, with each pitcher's seasons in ascending age.
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ]



# Mean OBP at each player age (one output row per age).
pitchers_aggregated_by_age <- pitchers %>%
  group_by(player_age) %>%
  summarise(OBP = mean(on_base_percent), .groups = "drop")

# The original idea was to graph the pitcher data by age, but we ran into a problem: a pitcher whose OBP climbs too high gets booted, so only pitchers with acceptable numbers remain at each age. As a result the average is relatively stable across ages, except at the old end.
# Bar chart of the mean OBP at each age.
# The fill colour is set as a fixed geom parameter: placing the constant
# "green" inside aes() maps it as a data value, so the bars would be drawn in
# the default palette colour rather than green.
ggplot(pitchers_aggregated_by_age, aes(player_age, OBP)) +
  geom_col(fill = "green") +
  ggtitle("OBP by age") +
  theme(plot.title = element_text(hjust = 0.5))

#The remedy is to find the average OBP by "year in the league".

# Number each pitcher's seasons 1, 2, 3, ... in row order. The rows are sorted
# by name and then by age, so the value is that pitcher's n-th season in the
# data. ave() runs seq_along() within every name group, which covers all rows,
# including the final one (a manual `while (i < n)` walk stops one row short
# and leaves the last row NA).
# as.numeric() keeps the column a double.
pitchers$Years_In_Pitching <- as.numeric(
  ave(seq_len(nrow(pitchers)), pitchers$name, FUN = seq_along)
)

# Mean OBP for each value of Years_In_Pitching (one output row per value).
pitchers_by_years_pitched <- pitchers %>%
  group_by(Years_In_Pitching) %>%
  summarise(OBP = mean(on_base_percent), .groups = "drop")

# Bar chart of the mean OBP for each year in pitching.
# The fill colour is a fixed geom parameter; inside aes() the constant "green"
# would be mapped as data and the bars would not actually be green.
ggplot(pitchers_by_years_pitched, aes(Years_In_Pitching, OBP)) +
  geom_col(fill = "green") +
  ggtitle("OBP by Years in Pitching") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing missing values (position_stack).

# Look at the trajectories of several careers.

# Bar chart of one pitcher's OBP against Years_In_Pitching.
#
# career: the rows of `pitchers` belonging to a single pitcher.
# Returns the ggplot object; it is drawn when auto-printed at top level.
# The fill is a fixed geom parameter, since aes(fill = "green") would map the
# string as data and not colour the bars green.
plot_career_obp <- function(career) {
  ggplot(career, aes(Years_In_Pitching, on_base_percent)) +
    geom_col(fill = "green") +
    ggtitle("OBP by Years in Pitching") +
    theme(plot.title = element_text(hjust = 0.5))
}

AndyBenes <- pitchers %>% filter(name == "Andy Benes")
plot_career_obp(AndyBenes)

AJBurnett <- pitchers %>% filter(name == "A.J. Burnett")
plot_career_obp(AJBurnett)

AdamWainwright <- pitchers %>% filter(name == "Adam Wainwright")
plot_career_obp(AdamWainwright)

AndyPettitte <- pitchers %>% filter(name == "Andy Pettitte")
plot_career_obp(AndyPettitte)

# Curious about the effect of age AND years in pitching on OBP.
# In an R model formula `a * b` expands to `a + b + a:b`, so this fits both
# main effects plus the age-by-years interaction term.

mylm <- lm(on_base_percent ~ player_age * Years_In_Pitching, data = pitchers)

# Analyze findings: print the coefficient table and overall fit statistics.
summary(mylm)
## 
## Call:
## lm(formula = on_base_percent ~ player_age * Years_In_Pitching, 
##     data = pitchers)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.096158 -0.016611  0.000262  0.017500  0.079295 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   3.191e-01  4.786e-03  66.681  < 2e-16 ***
## player_age                   -4.773e-05  1.702e-04  -0.280 0.779197    
## Years_In_Pitching            -3.686e-03  9.632e-04  -3.827 0.000132 ***
## player_age:Years_In_Pitching  8.487e-05  2.828e-05   3.001 0.002710 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02552 on 3320 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.009134,   Adjusted R-squared:  0.008238 
## F-statistic:  10.2 on 3 and 3320 DF,  p-value: 1.1e-06
# Pull out just the multiple R-squared of the fitted model.
summary(mylm)[["r.squared"]]
## [1] 0.009133655
# Re-read the raw statistics, replacing the current `pitchers` table.
pitchers <- read_csv(file = "stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   last_name = col_character(),
##   first_name = col_character(),
##   X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Wrap as a tibble and add a "First Last" name column.
pitchers <- tibble(pitchers) %>%
  mutate(name = paste(first_name, last_name))

# Order by name (seasons in ascending age within a pitcher), then tag every
# season with the decade it falls in (year floored to a multiple of 10).
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ] %>%
  mutate(decade = 10 * (year %/% 10))

# Print a preview of the table.
pitchers
## # A tibble: 3,325 x 28
##    last_name first_name  year player_age p_game p_formatted_ip p_total_pa  p_ab
##    <chr>     <chr>      <dbl>      <dbl>  <dbl>          <dbl>      <dbl> <dbl>
##  1 Burnett   A.J.        2001         24     27           173.        733   629
##  2 Burnett   A.J.        2002         25     31           204.        844   732
##  3 Burnett   A.J.        2005         28     32           209         873   775
##  4 Burnett   A.J.        2007         30     25           165.        691   611
##  5 Burnett   A.J.        2008         31     35           221.        957   849
##  6 Burnett   A.J.        2009         32     33           207         896   781
##  7 Burnett   A.J.        2010         33     33           186.        829   715
##  8 Burnett   A.J.        2011         34     33           190.        837   731
##  9 Burnett   A.J.        2012         35     31           202.        851   767
## 10 Burnett   A.J.        2013         36     30           191         801   714
## # ... with 3,315 more rows, and 20 more variables: p_total_hits <dbl>,
## #   p_single <dbl>, p_double <dbl>, p_triple <dbl>, p_home_run <dbl>,
## #   p_strikeout <dbl>, p_walk <dbl>, p_k_percent <dbl>, p_bb_percent <dbl>,
## #   batting_avg <dbl>, slg_percent <dbl>, on_base_percent <dbl>,
## #   on_base_plus_slg <dbl>, isolated_power <dbl>, p_earned_run <dbl>,
## #   p_run <dbl>, p_balk <dbl>, X26 <lgl>, name <chr>, decade <dbl>
#devtools::install_github("bokeh/rbokeh@v0.6.3")

library(rbokeh)
## 
## Attaching package: 'rbokeh'
## The following object is masked from 'package:readr':
## 
##     spec
## The following object is masked from 'package:ggplot2':
## 
##     arrow
library(quanteda)
library(tidyverse)

# Load the pitching statistics from stats.csv in the working directory.
pitchers <- read_csv(file = "stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   last_name = col_character(),
##   first_name = col_character(),
##   X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Wrap as a tibble and add a "First Last" name column (added once; repeating
# the identical mutate() is a no-op).
pitchers <- tibble(pitchers) %>%
  mutate(name = paste(first_name, last_name))

# Order rows by pitcher name, with each pitcher's seasons in ascending age.
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ]

# Number each pitcher's seasons 1, 2, 3, ... in row order. ave() applies
# seq_along() within every name group, so every row is filled, including the
# last one (a manual `while (i < n)` walk stops one row short and leaves the
# final row NA). as.numeric() keeps the column a double.
pitchers$Years_In_Pitching <- as.numeric(
  ave(seq_len(nrow(pitchers)), pitchers$name, FUN = seq_along)
)

# Decade of each season: year floored to a multiple of 10.
pitchers <- pitchers %>% mutate(decade = (year %/% 10) * 10)

pitchers
## # A tibble: 3,325 x 29
##    last_name first_name  year player_age p_game p_formatted_ip p_total_pa  p_ab
##    <chr>     <chr>      <dbl>      <dbl>  <dbl>          <dbl>      <dbl> <dbl>
##  1 Burnett   A.J.        2001         24     27           173.        733   629
##  2 Burnett   A.J.        2002         25     31           204.        844   732
##  3 Burnett   A.J.        2005         28     32           209         873   775
##  4 Burnett   A.J.        2007         30     25           165.        691   611
##  5 Burnett   A.J.        2008         31     35           221.        957   849
##  6 Burnett   A.J.        2009         32     33           207         896   781
##  7 Burnett   A.J.        2010         33     33           186.        829   715
##  8 Burnett   A.J.        2011         34     33           190.        837   731
##  9 Burnett   A.J.        2012         35     31           202.        851   767
## 10 Burnett   A.J.        2013         36     30           191         801   714
## # ... with 3,315 more rows, and 21 more variables: p_total_hits <dbl>,
## #   p_single <dbl>, p_double <dbl>, p_triple <dbl>, p_home_run <dbl>,
## #   p_strikeout <dbl>, p_walk <dbl>, p_k_percent <dbl>, p_bb_percent <dbl>,
## #   batting_avg <dbl>, slg_percent <dbl>, on_base_percent <dbl>,
## #   on_base_plus_slg <dbl>, isolated_power <dbl>, p_earned_run <dbl>,
## #   p_run <dbl>, p_balk <dbl>, X26 <lgl>, name <chr>, Years_In_Pitching <dbl>,
## #   decade <dbl>
# Interactive rbokeh scatter of strikeouts against OBP, coloured by
# Years_In_Pitching; the hover tooltip lists name, year, OBP and strikeouts.
ly_points(
  figure(),
  x = on_base_percent,
  y = p_strikeout,
  color = Years_In_Pitching,
  data = pitchers,
  hover = list(name, year, on_base_percent, p_strikeout)
)
## Warning: `lang_args()` is deprecated as of rlang 0.2.0.
## Please use `call_args()` instead.
## This warning is displayed once per session.
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
#install.packages("ggpointdensity")
library(ggpointdensity)
## Warning: package 'ggpointdensity' was built under R version 4.0.3
library(ggplot2)
library(plotly) 
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Scatter of strikeouts against OBP, coloured by decade, converted to an
# interactive plotly widget. A ggplot object with no geom layer has nothing to
# draw, so the point layer is added before handing the plot to ggplotly().
p <- ggplot(pitchers, aes(on_base_percent, p_strikeout, colour = decade)) +
  geom_point()
ggplotly(p)
#install.packages("WVPlots")
#install.packages("GGally")
library(WVPlots) 
## Warning: package 'WVPlots' was built under R version 4.0.3
## Loading required package: wrapr
## Warning: package 'wrapr' was built under R version 4.0.3
## 
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
## 
##     coalesce
## The following object is masked from 'package:tidyr':
## 
##     unpack
## The following object is masked from 'package:tibble':
## 
##     view
library(quanteda)
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidyverse)

# Re-read the raw statistics, replacing the current `pitchers` table.
pitchers <- read_csv(file = "stats.csv")
## Warning: Missing column names filled in: 'X26' [26]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   last_name = col_character(),
##   first_name = col_character(),
##   X26 = col_logical()
## )
## See spec(...) for full column specifications.
# Wrap as a tibble and add a "First Last" name column.
pitchers <- tibble(pitchers) %>%
  mutate(name = paste(first_name, last_name))

# Order by name (seasons in ascending age within a pitcher), then add the
# decade both as a number and as a character label (`sdecade`).
pitchers <- pitchers[order(pitchers$name, pitchers$player_age), ] %>%
  mutate(
    decade = 10 * (year %/% 10),
    sdecade = format(decade, digits = 4)
  )

# List the column names.
colnames(pitchers)
##  [1] "last_name"        "first_name"       "year"             "player_age"      
##  [5] "p_game"           "p_formatted_ip"   "p_total_pa"       "p_ab"            
##  [9] "p_total_hits"     "p_single"         "p_double"         "p_triple"        
## [13] "p_home_run"       "p_strikeout"      "p_walk"           "p_k_percent"     
## [17] "p_bb_percent"     "batting_avg"      "slg_percent"      "on_base_percent" 
## [21] "on_base_plus_slg" "isolated_power"   "p_earned_run"     "p_run"           
## [25] "p_balk"           "X26"              "name"             "decade"          
## [29] "sdecade"
# Give four columns display-friendly names; every other column name passes
# through recode() unchanged.
colnames(pitchers) <- dplyr::recode(
  colnames(pitchers),
  player_age = "Player Age",
  p_strikeout = "Strikeouts",
  p_walk = "Walks",
  on_base_percent = "OBP"
)

# Print a preview of the renamed table.
pitchers
## # A tibble: 3,325 x 29
##    last_name first_name  year `Player Age` p_game p_formatted_ip p_total_pa
##    <chr>     <chr>      <dbl>        <dbl>  <dbl>          <dbl>      <dbl>
##  1 Burnett   A.J.        2001           24     27           173.        733
##  2 Burnett   A.J.        2002           25     31           204.        844
##  3 Burnett   A.J.        2005           28     32           209         873
##  4 Burnett   A.J.        2007           30     25           165.        691
##  5 Burnett   A.J.        2008           31     35           221.        957
##  6 Burnett   A.J.        2009           32     33           207         896
##  7 Burnett   A.J.        2010           33     33           186.        829
##  8 Burnett   A.J.        2011           34     33           190.        837
##  9 Burnett   A.J.        2012           35     31           202.        851
## 10 Burnett   A.J.        2013           36     30           191         801
## # ... with 3,315 more rows, and 22 more variables: p_ab <dbl>,
## #   p_total_hits <dbl>, p_single <dbl>, p_double <dbl>, p_triple <dbl>,
## #   p_home_run <dbl>, Strikeouts <dbl>, Walks <dbl>, p_k_percent <dbl>,
## #   p_bb_percent <dbl>, batting_avg <dbl>, slg_percent <dbl>, OBP <dbl>,
## #   on_base_plus_slg <dbl>, isolated_power <dbl>, p_earned_run <dbl>,
## #   p_run <dbl>, p_balk <dbl>, X26 <lgl>, name <chr>, decade <dbl>,
## #   sdecade <chr>
# Pairwise plot matrix of age, strikeouts, walks and OBP, coloured by decade.
# - Columns are selected by name, so the plot does not silently change if the
#   column order of `pitchers` shifts.
# - `legend.position` is a theme setting, not an aesthetic, so it is not passed
#   through aes(). NOTE(review): if a legend on the left is wanted, ggpairs()
#   has a `legend` argument and the position is set with
#   `+ theme(legend.position = "left")` -- confirm the desired look.
crossplot_matrix <- ggpairs(
  pitchers,
  mapping = aes(color = sdecade, alpha = .6),
  columns = c("Player Age", "Strikeouts", "Walks", "OBP")
)
crossplot_matrix

# Save this exact plot (not whatever was displayed last), with the argument
# name spelled out rather than relying on partial matching of `file`.
ggsave(
  filename = "Crossplot Matrix.png",
  plot = crossplot_matrix,
  width = 8,
  height = 5,
  dpi = 500
)